library(ggplot2)
library(dplyr)
library(tidyr)
library("ggpubr")
library(LDATS)
library(stringr)

source("utils.R")

Visualization on CIFAR100. We are using data of three neural networks trained on reduced CIFAR100 training set. Half of the CIFAR100 training set was extracted as a validation set. We then divided both the reduced training set and validation set into 5 disjoint subsets and trained an ensemble on each of them. This was done in 10 replications, each time with random split of the training set into validation and new training set. In this visualization, we are trying to inspect the outputs deeper, mainly to make sense of strange behavior of nll metric for ensemble outputs.

base_dir <- "../data/data_train_val_half_c100"
repls <- 0:9
folds <- 0:4
classes <- 100

nets_outputs <- load_network_outputs(base_dir, repls)
ens_outputs <- load_ensemble_outputs(base_dir, repls, folds)
net_results <- read.csv(file.path(base_dir, "net_accuracies.csv"))
ens_results <- read.csv(file.path(base_dir, "ensemble_accuracies.csv"))
preds <- nets_outputs$test_outputs
for (ri in repls + 1)
{
  for (net_i in seq_along(nets_outputs[["networks"]]))
  {
    preds[ri, net_i, ,] <- softmax(preds[ri, net_i, , ])
  }
}
nets_test_cor_probs <- gather(preds, 1 + nets_outputs$test_labels[1, ], 3, 4)
nets_test_cor_probs <- melt(nets_test_cor_probs)
nets_test_cor_probs <- nets_test_cor_probs[, c(-3, -4)]
names(nets_test_cor_probs) <- c("replication", "network", "prediction")
nets_test_cor_probs$network <- as.factor(nets_test_cor_probs$network)
levels(nets_test_cor_probs$network) <- nets_outputs$networks
nets_cor_preds_histo <- ggplot(data=nets_test_cor_probs) + geom_histogram(mapping=aes(x=prediction), binwidth=0.01) + facet_wrap(~network) + scale_y_log10()
nets_cor_preds_histo

val_ens_cor_probs <- gather(ens_outputs$val_training, 1 + nets_outputs$test_labels[1, ], 4, 5)
val_ens_cor_probs <- melt(val_ens_cor_probs)
val_ens_cor_probs <- val_ens_cor_probs[, c(-4, -5)]
names(val_ens_cor_probs) <- c("replication", "method", "fold", "prediction")
val_ens_cor_probs$method <- as.factor(val_ens_cor_probs$method)
levels(val_ens_cor_probs$method) <- ens_outputs$methods
val_ens_cor_preds_histo <- ggplot(data=val_ens_cor_probs) + geom_histogram(mapping=aes(x=prediction), binwidth=0.01) + facet_wrap(~method) + scale_y_log10() + ggtitle("Probabilities predicted for the correct class - ens trained on val")
val_ens_cor_preds_histo

val_ens_zero_counts <- ggplot(data=val_ens_cor_probs[val_ens_cor_probs$prediction <= 0, ]) + geom_histogram(mapping=aes(x=method), stat="count")
Warning: Ignoring unknown parameters: binwidth, bins, pad
val_ens_zero_counts

val_ens_nll <- ggplot(data=ens_results) + geom_boxplot(mapping=aes(x=method, y=nll)) + facet_wrap(~train_set)
val_ens_nll

train_ens_cor_probs <- gather(ens_outputs$train_training, 1 + nets_outputs$test_labels[1, ], 4, 5)
train_ens_cor_probs <- melt(train_ens_cor_probs)
train_ens_cor_probs <- train_ens_cor_probs[, c(-4, -5)]
names(train_ens_cor_probs) <- c("replication", "method", "fold", "prediction")
train_ens_cor_probs$method <- as.factor(train_ens_cor_probs$method)
levels(train_ens_cor_probs$method) <- ens_outputs$methods
train_ens_cor_preds_histo <- ggplot(data=train_ens_cor_probs) + geom_histogram(mapping=aes(x=prediction), binwidth=0.01) + facet_wrap(~method) + scale_y_log10() + ggtitle("Probabilities predicted for the correct class - ens trained on train")
train_ens_cor_preds_histo
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Removed 64 rows containing missing values (geom_bar).

train_ens_zero_counts <- ggplot(data=train_ens_cor_probs[train_ens_cor_probs$prediction <= 0, ]) + geom_histogram(mapping=aes(x=method), stat="count")
Warning: Ignoring unknown parameters: binwidth, bins, pad
train_ens_zero_counts

val_ens_cor_probs$train_type <- "vt"
train_ens_cor_probs$train_type <- "tt"
ens_cor_probs <- rbind(val_ens_cor_probs, train_ens_cor_probs)
ens_cor_preds_histo <- ggplot(data=ens_cor_probs) + geom_histogram(mapping=aes(x=prediction), binwidth=0.01) + facet_grid(rows=vars(method), cols=vars(train_type)) + scale_y_log10() + ggtitle("Probabilities predicted for the correct class")
ens_cor_preds_histo
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Removed 64 rows containing missing values (geom_bar).

Very strange behavior for the bc method trained on training set. Needs further attention.

val_aggr_Rs <- np$load(file.path(base_dir, "val_training_class_aggr_R.npy"))
train_aggr_Rs <- np$load(file.path(base_dir, "train_training_class_aggr_R.npy"))
df_val_aggr_Rs <- melt(val_aggr_Rs)
names(df_val_aggr_Rs) <- c("precision", "class", "class1", "class2", "prob")
df_train_aggr_Rs <- melt(train_aggr_Rs)
names(df_train_aggr_Rs) <- c("precision", "class", "class1", "class2", "prob")
df_val_aggr_Rs$train_type <- "val_training"
df_train_aggr_Rs$train_type <- "train_training"
df_aggr_Rs <- rbind(df_val_aggr_Rs, df_train_aggr_Rs)
df_aggr_Rs[, c("class", "class1", "class2")] <- lapply(df_aggr_Rs[, c("class", "class1", "class2")], as.factor)

df_aggr_Rs_diff <- df_aggr_Rs %>% pivot_wider(names_from = train_type, values_from = prob) %>% mutate(val_min_train = val_training - train_training)
for (cls in 1:classes)
{
  cur_class_Rs <- df_aggr_Rs %>% filter(class == cls)
  plot_cls <-  ggplot(cur_class_Rs, aes(x = class2, y = class1)) + 
    geom_raster(aes(fill=prob)) + 
    facet_wrap(~train_type) +
    scale_fill_gradient(low="grey90", high="red") +
    scale_y_discrete(limits=rev, breaks=seq(0, classes, 10)) +
    scale_x_discrete(breaks=seq(0, classes, 10)) +
    labs(x="class 2", y="class 1", title=paste("Pairwise probabilities - class ", cls)) +
    theme_bw()
  
  print(plot_cls)
}

Difference between these two LDA training methodologies are not well visible, so we will plot just the differences.

for (cls in 1:classes)
{
  cur_class_Rs <- df_aggr_Rs_diff %>% filter(class == cls)
  plot_cls <-  ggplot(cur_class_Rs, aes(x = class2, y = class1)) + 
    geom_raster(aes(fill=val_min_train)) + 
    scale_fill_gradient2(low="blue", high="red", mid="white", midpoint=0) +
    scale_y_discrete(limits=rev, breaks=seq(0, classes, 10)) +
    scale_x_discrete(breaks=seq(0, classes, 10)) +
    labs(x="class 2", y="class 1", title=paste("Pairwise probabilities differences - class ", cls)) +
    theme_bw()
  
  print(plot_cls)
}

lda_coefs <- load_lda_coefs(base_dir, repls, folds)
for (cl1 in 1:19)
{
  for (cl2 in (cl1 + 1):20)
  {
    cur_plt <- lda_coefs %>% filter(class1 == cl1 & class2 == cl2) %>% ggplot() + geom_boxplot(aes(x=coefficient, y=value)) +
      facet_wrap(~train_type) + ggtitle(paste("Coefficients for class", cl1, "vs", cl2))
    print(cur_plt)
  }
}

Coefficients of LDA trained on validation set have lower variance and more similar values across the networks.

For validation set trained LDAs, red - densenet seems to be dominant. On the other hand for train set trained LDAs blue - xception has higher values.

avg_lda_coefs <- lda_coefs %>% filter(coefficient != "interc") %>% group_by(class1, class2, precision, train_type, coefficient) %>% summarise( value = mean(value)) %>% ungroup()
`summarise()` has grouped output by 'class1', 'class2', 'precision', 'train_type'. You can override using the `.groups` argument.
avg_lda_coefs_vt <- avg_lda_coefs %>% filter(train_type=="val_training")
avg_lda_coefs_tt <- avg_lda_coefs %>% filter(train_type=="train_training")
avg_lda_coefs_vt$value <- avg_lda_coefs_vt$value - min(avg_lda_coefs_vt$value)
avg_lda_coefs_vt$value <- avg_lda_coefs_vt$value / max(avg_lda_coefs_vt$value)
avg_lda_coefs_tt$value <- avg_lda_coefs_tt$value - min(avg_lda_coefs_tt$value)
avg_lda_coefs_tt$value <- avg_lda_coefs_tt$value / max(avg_lda_coefs_tt$value)
avg_lda_coefs <- rbind(avg_lda_coefs_vt, avg_lda_coefs_tt)
avg_lda_c_w <- pivot_wider(avg_lda_coefs, names_from = coefficient, values_from = value)
avg_lda_c_w[, c("class1", "class2")] <- lapply(avg_lda_c_w[, c("class1", "class2")], as.factor)
avg_lda_c_w$top_net <- factor(c("densenet121", "resnet34", "xception")[max.col(as.matrix(avg_lda_c_w[, c("densenet121", "resnet34", "xception")]))])
raster_plot <- ggplot(avg_lda_c_w) + 
  geom_tile(aes(x=class2, y=class1, fill=rgb(densenet121, resnet34, xception))) +
  scale_y_discrete(limits=rev, breaks=seq(0,classes, 10)) + scale_x_discrete(breaks=seq(0,classes, 10)) + scale_fill_identity() + facet_wrap(~train_type)
raster_plot

For validation set trained LDAs, red - densenet seems to be dominant. On the other hand for train set trained LDAs blue - xception has higher values.

coefs_grid <- ggplot(avg_lda_c_w, aes(x=class2, y=class1, fill=top_net)) + 
  geom_raster() + 
  scale_fill_brewer(type="qual") +
  facet_wrap(~train_type) +
  scale_y_discrete(breaks=seq(0, classes, 10), limits=rev) +
  scale_x_discrete(breaks=seq(0, classes, 10)) +
  guides(fill=guide_legend(title="Network")) +
  xlab("Class") + 
  ylab("Class") +
  ggtitle("Network with highest lda weight for class pairs") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.ticks = element_blank(),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank())

coefs_grid

LDAs trained on nn train set seems to be dominated by xception. LDAs trained on validation set by densenet.

---
title: "Outputs inspection half CIFAR100"
output:
  pdf_document: default
  html_notebook: default
---

```{r}
library(ggplot2)
library(dplyr)
library(tidyr)
library("ggpubr")
library(LDATS)
library(stringr)

source("utils.R")
```

Visualization on CIFAR100.
We are using data of three neural networks trained on reduced CIFAR100 training set. Half of the CIFAR100 training set was extracted as a validation set. We then divided both the reduced training set and validation set into 5 disjoint subsets and trained an ensemble on each of them. This was done in 10 replications, each time with random split of the training set into validation and new training set.
In this visualization, we are trying to inspect the outputs deeper, mainly to make sense of strange behavior of nll metric for ensemble outputs.

```{r}
base_dir <- "../data/data_train_val_half_c100"
repls <- 0:9
folds <- 0:4
classes <- 100

nets_outputs <- load_network_outputs(base_dir, repls)
ens_outputs <- load_ensemble_outputs(base_dir, repls, folds)
net_results <- read.csv(file.path(base_dir, "net_accuracies.csv"))
ens_results <- read.csv(file.path(base_dir, "ensemble_accuracies.csv"))
```


```{r}
preds <- nets_outputs$test_outputs
for (ri in repls + 1)
{
  for (net_i in seq_along(nets_outputs[["networks"]]))
  {
    preds[ri, net_i, ,] <- softmax(preds[ri, net_i, , ])
  }
}
nets_test_cor_probs <- gather(preds, 1 + nets_outputs$test_labels[1, ], 3, 4)
nets_test_cor_probs <- melt(nets_test_cor_probs)
nets_test_cor_probs <- nets_test_cor_probs[, c(-3, -4)]
names(nets_test_cor_probs) <- c("replication", "network", "prediction")
nets_test_cor_probs$network <- as.factor(nets_test_cor_probs$network)
levels(nets_test_cor_probs$network) <- nets_outputs$networks
```

```{r}
nets_cor_preds_histo <- ggplot(data=nets_test_cor_probs) + geom_histogram(mapping=aes(x=prediction), binwidth=0.01) + facet_wrap(~network) + scale_y_log10()
nets_cor_preds_histo
```


```{r}
val_ens_cor_probs <- gather(ens_outputs$val_training, 1 + nets_outputs$test_labels[1, ], 4, 5)
val_ens_cor_probs <- melt(val_ens_cor_probs)
val_ens_cor_probs <- val_ens_cor_probs[, c(-4, -5)]
names(val_ens_cor_probs) <- c("replication", "method", "fold", "prediction")
val_ens_cor_probs$method <- as.factor(val_ens_cor_probs$method)
levels(val_ens_cor_probs$method) <- ens_outputs$methods
```

```{r}
val_ens_cor_preds_histo <- ggplot(data=val_ens_cor_probs) + geom_histogram(mapping=aes(x=prediction), binwidth=0.01) + facet_wrap(~method) + scale_y_log10() + ggtitle("Probabilities predicted for the correct class - ens trained on val")
val_ens_cor_preds_histo
```

```{r}
val_ens_zero_counts <- ggplot(data=val_ens_cor_probs[val_ens_cor_probs$prediction <= 0, ]) + geom_histogram(mapping=aes(x=method), stat="count")
val_ens_zero_counts
```

```{r}
val_ens_nll <- ggplot(data=ens_results) + geom_boxplot(mapping=aes(x=method, y=nll)) + facet_wrap(~train_set)
val_ens_nll
```


```{r}
train_ens_cor_probs <- gather(ens_outputs$train_training, 1 + nets_outputs$test_labels[1, ], 4, 5)
train_ens_cor_probs <- melt(train_ens_cor_probs)
train_ens_cor_probs <- train_ens_cor_probs[, c(-4, -5)]
names(train_ens_cor_probs) <- c("replication", "method", "fold", "prediction")
train_ens_cor_probs$method <- as.factor(train_ens_cor_probs$method)
levels(train_ens_cor_probs$method) <- ens_outputs$methods
```


```{r}
train_ens_cor_preds_histo <- ggplot(data=train_ens_cor_probs) + geom_histogram(mapping=aes(x=prediction), binwidth=0.01) + facet_wrap(~method) + scale_y_log10() + ggtitle("Probabilities predicted for the correct class - ens trained on train")
train_ens_cor_preds_histo
```


```{r}
train_ens_zero_counts <- ggplot(data=train_ens_cor_probs[train_ens_cor_probs$prediction <= 0, ]) + geom_histogram(mapping=aes(x=method), stat="count")
train_ens_zero_counts
```

```{r}
val_ens_cor_probs$train_type <- "vt"
train_ens_cor_probs$train_type <- "tt"
ens_cor_probs <- rbind(val_ens_cor_probs, train_ens_cor_probs)
```

```{r}
ens_cor_preds_histo <- ggplot(data=ens_cor_probs) + geom_histogram(mapping=aes(x=prediction), binwidth=0.01) + facet_grid(rows=vars(method), cols=vars(train_type)) + scale_y_log10() + ggtitle("Probabilities predicted for the correct class")
ens_cor_preds_histo
```
Very strange behavior for the bc method trained on training set. Needs further attention.

```{r}
val_aggr_Rs <- np$load(file.path(base_dir, "val_training_class_aggr_R.npy"))
train_aggr_Rs <- np$load(file.path(base_dir, "train_training_class_aggr_R.npy"))
df_val_aggr_Rs <- melt(val_aggr_Rs)
names(df_val_aggr_Rs) <- c("precision", "class", "class1", "class2", "prob")
df_train_aggr_Rs <- melt(train_aggr_Rs)
names(df_train_aggr_Rs) <- c("precision", "class", "class1", "class2", "prob")
df_val_aggr_Rs$train_type <- "val_training"
df_train_aggr_Rs$train_type <- "train_training"
df_aggr_Rs <- rbind(df_val_aggr_Rs, df_train_aggr_Rs)
df_aggr_Rs[, c("class", "class1", "class2")] <- lapply(df_aggr_Rs[, c("class", "class1", "class2")], as.factor)

df_aggr_Rs_diff <- df_aggr_Rs %>% pivot_wider(names_from = train_type, values_from = prob) %>% mutate(val_min_train = val_training - train_training)
```

```{r}
for (cls in 1:classes)
{
  cur_class_Rs <- df_aggr_Rs %>% filter(class == cls)
  plot_cls <-  ggplot(cur_class_Rs, aes(x = class2, y = class1)) + 
    geom_raster(aes(fill=prob)) + 
    facet_wrap(~train_type) +
    scale_fill_gradient(low="grey90", high="red") +
    scale_y_discrete(limits=rev, breaks=seq(0, classes, 10)) +
    scale_x_discrete(breaks=seq(0, classes, 10)) +
    labs(x="class 2", y="class 1", title=paste("Pairwise probabilities - class ", cls)) +
    theme_bw()
  
  print(plot_cls)
}
```

Difference between these two LDA training methodologies are not well visible, so we will plot just the differences.

```{r}
for (cls in 1:classes)
{
  cur_class_Rs <- df_aggr_Rs_diff %>% filter(class == cls)
  plot_cls <-  ggplot(cur_class_Rs, aes(x = class2, y = class1)) + 
    geom_raster(aes(fill=val_min_train)) + 
    scale_fill_gradient2(low="blue", high="red", mid="white", midpoint=0) +
    scale_y_discrete(limits=rev, breaks=seq(0, classes, 10)) +
    scale_x_discrete(breaks=seq(0, classes, 10)) +
    labs(x="class 2", y="class 1", title=paste("Pairwise probabilities differences - class ", cls)) +
    theme_bw()
  
  print(plot_cls)
}
```



```{r}
lda_coefs <- load_lda_coefs(base_dir, repls, folds)
```
```{r}
for (cl1 in 1:19)
{
  for (cl2 in (cl1 + 1):20)
  {
    cur_plt <- lda_coefs %>% filter(class1 == cl1 & class2 == cl2) %>% ggplot() + geom_boxplot(aes(x=coefficient, y=value)) +
      facet_wrap(~train_type) + ggtitle(paste("Coefficients for class", cl1, "vs", cl2))
    print(cur_plt)
  }
}
```
Coefficients of LDA trained on validation set have lower variance and more similar values across the networks.


For validation set trained LDAs, red - densenet seems to be dominant. On the other hand for train set trained LDAs blue - xception has higher values.

```{r}
avg_lda_coefs <- lda_coefs %>% filter(coefficient != "interc") %>% group_by(class1, class2, precision, train_type, coefficient) %>% summarise( value = mean(value)) %>% ungroup()
avg_lda_coefs_vt <- avg_lda_coefs %>% filter(train_type=="val_training")
avg_lda_coefs_tt <- avg_lda_coefs %>% filter(train_type=="train_training")
avg_lda_coefs_vt$value <- avg_lda_coefs_vt$value - min(avg_lda_coefs_vt$value)
avg_lda_coefs_vt$value <- avg_lda_coefs_vt$value / max(avg_lda_coefs_vt$value)
avg_lda_coefs_tt$value <- avg_lda_coefs_tt$value - min(avg_lda_coefs_tt$value)
avg_lda_coefs_tt$value <- avg_lda_coefs_tt$value / max(avg_lda_coefs_tt$value)
avg_lda_coefs <- rbind(avg_lda_coefs_vt, avg_lda_coefs_tt)
avg_lda_c_w <- pivot_wider(avg_lda_coefs, names_from = coefficient, values_from = value)
avg_lda_c_w[, c("class1", "class2")] <- lapply(avg_lda_c_w[, c("class1", "class2")], as.factor)
avg_lda_c_w$top_net <- factor(c("densenet121", "resnet34", "xception")[max.col(as.matrix(avg_lda_c_w[, c("densenet121", "resnet34", "xception")]))])
```

```{r}
raster_plot <- ggplot(avg_lda_c_w) + 
  geom_tile(aes(x=class2, y=class1, fill=rgb(densenet121, resnet34, xception))) +
  scale_y_discrete(limits=rev, breaks=seq(0,classes, 10)) + scale_x_discrete(breaks=seq(0,classes, 10)) + scale_fill_identity() + facet_wrap(~train_type) + ggtitle("RGB image formed from lda coefficients for networks densenet, resnet, xception")
raster_plot
```
For validation set trained LDAs, red - densenet seems to be dominant. On the other hand for train set trained LDAs blue - xception has higher values.

```{r}
coefs_grid <- ggplot(avg_lda_c_w, aes(x=class2, y=class1, fill=top_net)) + 
  geom_raster() + 
  scale_fill_brewer(type="qual") +
  facet_wrap(~train_type) +
  scale_y_discrete(breaks=seq(0, classes, 10), limits=rev) +
  scale_x_discrete(breaks=seq(0, classes, 10)) +
  guides(fill=guide_legend(title="Network")) +
  xlab("Class") + 
  ylab("Class") +
  ggtitle("Network with highest lda weight for class pairs") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.ticks = element_blank(),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank())

coefs_grid
```

LDAs trained on nn train set seems to be dominated by xception. LDAs trained on validation set by densenet.
